function data_set = create_interaction_variables(data_set,vars,range_nways,separator,max_varname_length)
% create_interaction_variables
% 
% for Matlab R13+
% version 1.1 (April 2012)
% (c) Brian Weidenbaum
% website: http://www.BrianWeidenbaum.com/.
%
%
% OUTPUT: your dataset (or a dataset based on your matrix), updated with new, aptly-named *unique* interaction variables,
% ranging from at least 2 to any number the user specifies
%
% INPUTS (** = OPTIONAL)
% input name: (input datatype/s) -- description
% 
% data_set: (dataset OR matrix) -- the data you want to alter
%
% **vars: (cell array of chars/numbers, OR vector of numbers, OR 'ALL') --
% default: 'ALL'
% the names of the variables you want to interact; alternatively, the column numbers of the variables you want to interact
% OR, you can just say 'ALL' to include all variables automatically
%
% **range_nways (vector OR 'MAX') -- 
% default: 2
% the range of numbers of variables to include in the interaction terms generated by this function
% alternatively, just type 'MAX' to use 2 variables to the maximum possible number of vars
%
% **separator (char) -- 
% default: '_'
% the separator string you want to use to divide the
% variable names that contributed to a new interaction variable.  Default
% is '_'.  E.g., by default, an interaction between Var1 and Var2 will be
% named 'Var1_Var2'.  
%
% **max_varname_length (number) -- 
% default: 63
% the maximum length of the newly created interaction terms' variable names.  
% Any dynamically generated variable names (e.g. 'Var1_Var2') that exceed this number will be excluded from the new dataset.
% You should set max_varname_length according to the database you plan to use with your data-- 
% e.g., if you only want to use this data in MATLAB, you should set max_varname_length=63 (the maximum length supported by the dataset class), 
% but if you plan to export your data to Oracle, you should set it to around 30.
%
%
%
% EXAMPLES
%
% You have a dataset with 3 variables: a, b, and c.  
% You want to create interaction terms, up to 3 ways, for all of your variables.
% You type:
% new_dataset = create_interaction_variables(old_dataset,'all','max');
% new_dataset will contain:
% a.*b, named 'a_b'
% a.*c, named 'a_c'
% b.*c, named 'b_c'
% and a.*b.*c, named 'a_b_c',
% plus all original variables.
% It will NOT contain b.*a, c.*a, etc because these are not unique combos.
% 
% You have a dataset with 3 variables: a, b, and c.  
% You want to create interaction terms, up to 2 ways, only for columns 2 and 3.
% You type:
% new_dataset = create_interaction_variables(old_dataset,[2 3],2);
% new_dataset will contain only b.*c,
% plus all original variables.
% 
% You have a dataset with 3 variables: a, b, and c.  
% You want to create interaction terms, up to 2 ways, only for 'a' and 'c'.
% You type:
% new_dataset = create_interaction_variables(old_dataset,{'a','c'},2);
% new_dataset will contain only a.*c,
% plus all original variables.
% 
% You have a dataset with 3 variables: a, b, and c.  
% You want to create interaction terms, ONLY 3 ways, for all vars.
% You type:
% new_dataset = create_interaction_variables(old_dataset,'all',3);
% new_dataset will contain only a .* b .* c,
% plus all original variables.
% 
% 
% 
% CHANGE LOG
% Changes between 1.0 and 1.1: 
% -added 'separator' parameter, enabling users to create custom named vars
% -changed 'max_nways' parameter to 'range_nways', giving user more control
% over the types of interactions created
% -enforced maximum max_varname_length as 63
% -reduced min number of arguments to 1, setting defaults for other 4 params
% -minor performance tweaks via vectorizing some inner function loops
% 
% 


	%PHASE ONE: USER INPUT VALIDATION

    % Fill in the default for every optional parameter the caller left out.
    if nargin < 2
        vars = 'all';
    end
    if nargin < 3
        range_nways = 2;
    end
    if nargin < 4
        separator = '_';
    end
    if nargin < 5
        max_varname_length = 63; %max length supported by MATLAB dataset class
    end

    % If data_set is a regular matrix, convert it to a dataset; otherwise the
    % user could not tell which columns correspond to which interactions.
    % The converted dataset carries default variable names ('Var1', 'Var2', ...).
    if ~isa(data_set,'dataset')
        data_set = matrix2dataset(data_set);
    end

    % 'all' / 'ALL' (case-insensitive) selects every variable in the dataset.
    if ischar(vars)
        if strcmpi(vars,'ALL')
            vars = data_set.Properties.VarNames;
        else
            error('If you are using a char array as your vars parameter, you can only set it as ''all'' or ''ALL''. ');
        end
    end

    % nvars is shared with the nested functions, so it must exist before
    % columns2names is called.
    nvars = length(vars);
    if nvars<2
       error('You must specify at least 2 variable names/column numbers inside the vars parameter.');
    end

    % Resolve vars into a cell array of variable names that really exist in
    % the dataset.  This is done BEFORE range_nways is validated so that
    % range_nways is clamped against the number of variables that survive.
    if iscell(vars) && ischar(vars{1})
        is_known = ismember(vars,data_set.Properties.VarNames);
        for i=1:length(vars)
            if ~is_known(i)
                disp([vars{i} ' is not a valid column and will not be included in the interactions.' ]);
            end
        end
        vars(~is_known)=[];
    else
        % column numbers, given either as a numeric vector or a cell array
        vars = columns2names(vars);
    end
    % authoritative count of the variables that will actually be interacted
    nvars = length(vars);

    %validate separator
    if ~ischar(separator)
       error('Separator needs to be a char array.');
    elseif length(separator)>60
        error('Your separator term is too long.');
    elseif ismember('%',separator)
        % '%' would start a comment inside the generated eval strings
        error('Your separator contains one or more illegal characters.');
    end

    %validate max_varname_length
    if ~isnumeric(max_varname_length) || ~isscalar(max_varname_length)
       error('max_varname_length must be a single number.');
    elseif max_varname_length>63
       error('Max varname length exceeds 63, which is the highest number supported by the MATLAB dataset class.');
    end

    % With fewer than 2 valid variables no interaction can be built; the
    % data is returned unchanged (as before), but the user is now told why.
    if nvars<2
        disp('Fewer than 2 valid variables remain, so no interaction variables were created.');
        return;
    end

    %validate range_nways
    if ischar(range_nways)
        % any char value (documented as 'MAX') means "2 up to all variables"
        range_nways = 2:nvars;
    elseif isnumeric(range_nways)
        if isempty(range_nways)
           error('The range_nways parameter should contain at least one number.');
        end
        % NaN also fails this test because NaN ~= floor(NaN)
        if any(range_nways(:) ~= floor(range_nways(:)))
           error('The range_nways parameter must contain only whole numbers.');
        end
        % 2 <= vars per interaction <= number of valid variables:
        % values < 2 become 2, values > nvars become nvars, duplicates are dropped
        if any(range_nways(:)<2)
           disp(['At least one number in range_nways is < 2.' ...
               '  Since you must have at least 2 variables per interaction, all numbers < 2 have been replaced with 2.']);
        end
        range_nways(range_nways<2)=2;
        if any(range_nways(:)>nvars)
            disp(['At least one number in range_nways is > ' num2str(nvars)...
                ' (the total number of variables you are trying to create interactions from).' ...
                '  All such numbers will be dropped from range_nways and replaced with a single ' num2str(nvars) '.']);
        end
        range_nways(range_nways>nvars)=nvars;
        range_nways=unique(range_nways);
        % force a row vector so that "for nways = range_nways" visits each value
        range_nways = reshape(range_nways,1,length(range_nways));
    else
        error('range_nways parameter MUST be a vector of numbers, or ''MAX''.');
    end

    %END INPUT VALIDATION


    %PHASE TWO: CREATE ALL THE INTERACTION VARIABLES
    % Each eval string has the form
    %   'data_set.a_b=data_set.a.*data_set.b;'
    % and adds one new variable to data_set.  Combinations whose name was too
    % long come back as empty strings, for which eval does nothing.
    all_evalstrs = get_evalstrs;
    for i=1:length(all_evalstrs)
        % try/catch keeps one failing product (e.g. a non-numeric variable)
        % from aborting the remaining interactions
        try
            eval(all_evalstrs{i});
        catch e
            disp(e.message);
        end
    end


    %PHASE THREE: PROFIT

    
	%
	%
	% INNER FUNCTIONS
	%
	%
	
	%inner function that translates varname INDICES to actual varnames cellarray of chars
	%for use in eval() function
    function char_varnames = columns2names(columns)
        % COLUMNS2NAMES  Translate column numbers into dataset variable names.
        %
        % columns: numeric vector of column numbers, or a cell array of numbers
        % char_varnames: cell array with the names of the valid columns, taken
        %                from data_set.Properties.VarNames
        %
        % A column number is invalid when it is < 1, greater than the number
        % of variables in data_set, or not a whole number (this also rejects
        % NaN, which would otherwise pass the range test and then fail with an
        % indexing error).  Each invalid number is reported, removed, and
        % subtracted from the shared counter nvars of the parent function.

        % a cell array of numbers is flattened into a plain numeric array first
        if iscell(columns)
            columns = cell2mat(columns);
        end

        n_cols = size(data_set,2);
        % NaN ~= floor(NaN) is true, so NaN is flagged by the last term
        bad = columns<1 | columns>n_cols | columns~=floor(columns);

        for k=1:numel(columns)
            if bad(k)
                disp(['Column number ' num2str(columns(k)) ' is not a valid column and will not be included in the interactions.' ]);
                nvars=nvars-1;
            end
        end

        columns(bad)=[];
        char_varnames = data_set.Properties.VarNames(columns);
    end %inner function


	% inner function that creates all UNIQUE combinations of indices for 2:maxvars_perinteraction
	% adds them all to a cell array
    function all_indices = get_interaction_indices()
        % GET_INTERACTION_INDICES  List every unique combination of variable
        % indices for each interaction size in range_nways.
        %
        % Returns a 1-by-N cell array; each cell holds one strictly ascending
        % row vector of indices into vars.  E.g. with 3 variables and
        % range_nways = [2 3] the cells are [1 2], [1 3], [2 3], [1 2 3].
        %
        % Reads range_nways and nvars from the parent function.
        %
        % nchoosek enumerates the combinations directly, in the same
        % lexicographic order that filtering the full cartesian product for
        % ascending rows produced, but without first building a matrix of
        % nvars^nways rows (which made large 'MAX' requests run out of memory).
        all_indices = {};
        for nways = range_nways
            % sizes outside 2..nvars cannot yield a valid interaction
            if nways < 2 || nways > nvars
                continue;
            end

            % one combination per row, columns ascending within each row
            combos = nchoosek(1:nvars, nways);

            n_rows = size(combos,1);
            rows = cell(1,n_rows);
            for r = 1:n_rows
                rows{r} = combos(r,:);
            end
            all_indices = [all_indices, rows]; %#ok<AGROW>
        end %loop over each requested number of vars per interaction
    end %get_interaction_indices inner fx


    % inner function that gets the 'data_set.var1_var2_varN=data_set.var1.*data_set.var2.*data_set.varN' version of indices
    function str = indices2str(indices)
        % INDICES2STR  Build the assignment statement for one interaction.
        %
        % indices: vector of positions into the parent's vars cell array
        %          (at least one index is expected)
        % str:     statement text, e.g. for indices [1 2] and separator '_':
        %            'data_set.var1_var2=data_set.var1.*data_set.var2;'
        %
        % Throws an error when the new variable name is longer than
        % max_varname_length; the caller catches it and skips the combination.
        names = vars(indices);
        n_names = length(indices);

        % new variable name: the contributing names joined by the separator
        name_parts = cell(1,2*n_names);
        name_parts(1:2:end) = names;
        name_parts(2:2:end) = {separator};
        new_name = [name_parts{1:end-1}]; % drop the trailing separator

        if length(new_name) > max_varname_length
           error(['Name of new variable: ''' new_name ...
               ''' exceeded max varname length of ' num2str(max_varname_length) ', and will not be included in the new dataset.']);
        end

        % right-hand side: element-wise product of the contributing variables
        factors = cell(1,2*n_names);
        factors(1:2:end) = strcat('data_set.',names);
        factors(2:2:end) = {'.*'};
        product_expr = [factors{1:end-1} ';']; % drop the trailing '.*'

        str = ['data_set.' new_name '=' product_expr];
    end %indices2str inner fx

	
    %will give cellarray where each cell contains a good evalstr
    function all_evalstrs = get_evalstrs
        % GET_EVALSTRS  One assignment statement per interaction combination.
        %
        % Returns an N-by-1 cell array.  Cell k holds the statement produced
        % by indices2str for combination k, or '' when indices2str raised an
        % error (e.g. the generated name was too long); in that case the
        % error message is displayed and processing continues.
        combo_list = get_interaction_indices;
        n_combos = length(combo_list);

        % start every entry as '' so a failed combination needs no extra work
        all_evalstrs = repmat({''},n_combos,1);

        for c = 1:n_combos
            try
                all_evalstrs{c} = indices2str(combo_list{c});
            catch err
                disp(err.message);
            end
        end
    end %get all evalstrs


    function A = allcomb(varargin)
        % ALLCOMB - All combinations
        %    B = ALLCOMB(A1,A2,A3,...,AN) returns all combinations of the elements
        %    in A1, A2, ..., and AN. B is P-by-N matrix is which P is the product
        %    of the number of elements of the N inputs.
        %    Empty inputs yields an empty matrix B of size 0-by-N. Note that
        %    previous versions (1.x) simply ignored empty inputs.
        %
        %    Example:
        %       allcomb([1 3 5],[-3 8],[0 1]) ;
        %         1  -3   0
        %         1  -3   1
        %         1   8   0
        %         ...
        %         5  -3   1
        %         5   8   0
        %         5   8   1
        %
        %    ALLCOMB(A1,..AN,'matlab') causes the first column to change fastest.
        %    This is more consistent with matlab indexing. Example:
        %    allcomb(1:2,3:4,5:6,'matlab') %->
        %      1   3   5
        %      2   3   5
        %      1   4   5
        %      ...
        %      2   4   6
        %
        %    This functionality is also known as the cartesian product.
        %
        %    See also NCHOOSEK, PERMS, NDGRID
        %    and COMBN, KTHCOMBN (Matlab Central FEX)

        % for Matlab R13+
        % version 2.2 (jan 2012)
        % (c) Jos van der Geest
        % email: jos@jasen.nl

        % History
        % 1.1 (feb 2006), removed minor bug when entering empty cell arrays;
        %     added option to let the first input run fastest (suggestion by JD)
        % 1.2 (jan 2010), using ii as an index on the left-hand for the multiple
        %     output by NDGRID. Thanks to Jan Simon, for showing this little trick
        % 2.0 (dec 2010). Bruno Luong convinced me that an empty input should
        % return an empty output.
        % 2.1 (feb 2011). A cell as input argument caused the check on the last
        %      argument (specifying the order) to crash.
        % 2.2 (jan 2012). removed a superfluous line of code (ischar(..))
        %
        % Local change: the deprecated NARGCHK call was replaced by an explicit
        % nargin test that raises the same message.

        if nargin < 1
            error('Not enough input arguments.') ;
        end

        % any empty input makes the whole result empty (0-by-nargin)
        q = ~cellfun('isempty',varargin) ;
        if any(~q)
            warning('ALLCOMB:EmptyInput','Empty inputs result in an empty output.') ;
            A = zeros(0,nargin) ;
            return ;
        end

        ni = sum(q) ;
        argn = varargin{end} ;

        if ischar(argn) && (strcmpi(argn,'matlab') || strcmpi(argn,'john'))
            % trailing option flag: first column changes fastest, and the
            % flag itself is excluded from the data arguments
            % (based on a suggestion by JD on the FEX)
            ni = ni-1 ;
            ii = 1:ni ;
            q(end) = 0 ;
        else
            % enter arguments backwards, so last one (AN) is changing fastest
            ii = ni:-1:1 ;
        end

        if ni==0
            % only the option flag was supplied
            A = [] ;
            return ;
        end

        args = varargin(q) ;
        if ~all(cellfun('isclass',args,'double'))
            error('All arguments should be arrays of doubles') ;
        end

        if ni==1
            A = args{1}(:) ;
        else
            % ndgrid outputs are stored in the order given by ii, which flips
            % them when the last column has to change fastest
            grids = cell(1,ni) ;
            [grids{ii}] = ndgrid(args{ii}) ;
            % stack along a new trailing dimension, then one column per input
            A = reshape(cat(ni+1,grids{:}),[],ni) ;
        end
    end %allcomb inner fx
	
	
	function output = matrix2dataset(matrix,ifx_varnames)
	% matrix2dataset
	%
	% Converts a matrix into a dataset with one dataset variable per column.
	%
	% INPUTS
	% matrix (N x M matrix): the data to convert
	% ifx_varnames (OPTIONAL cell vector of chars): the column names, in
	%   order; there must be exactly one name per matrix column
	%
	% OUTPUT:
	% a dataset holding the columns of matrix, named according to
	% ifx_varnames when given; without names the dataset constructor assigns
	% its defaults (Var1, Var2, ..., VarN)
	%
	% EXAMPLE:
	% After turning a dataset into a matrix of doubles for some calculation,
	% rebuild a dataset with the original names:
	% updated_dataset = matrix2dataset(matrix_from_old_dataset, old_dataset.Properties.VarNames);
	%

		n_columns = size(matrix,2);
		has_names = (nargin==2);
		if has_names && length(ifx_varnames)~=n_columns
			error('You must have one variable name for each column in the matrix parameter.');
		end

		% one 'matrix(:,k),' argument per column; the text is evaluated below
		% so that each column reaches the dataset constructor separately
		column_args = cell(1,n_columns);
		for c=1:n_columns
			column_args{c} = ['matrix(:,', num2str(c), '),'];
		end
		arg_list = [column_args{:}];

		switch nargin
			case 2
				arg_list = [arg_list, '''VarNames'',', 'ifx_varnames'];
			case 1
				% no names follow, so drop the trailing comma
				arg_list = arg_list(1:end-1);
		end
		output = eval(['dataset(' arg_list ');']);
	end%innerfx
	
	%END OF INNER FUNCTIONS

end%main fx	